import os
import re
from glob import glob

import nltk
import pandas as pd
import spacy
from bertopic import BERTopic
from bertopic.backend import BaseEmbedder
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from umap import UMAP
def _get_nlp():
    """Load the spaCy English pipeline once and cache it on the function.

    The previous implementation called spacy.load() inside preprocess_text,
    reloading the full model on every document (seconds per call).
    """
    if _get_nlp._nlp is None:
        _get_nlp._nlp = spacy.load('en_core_web_sm')
    return _get_nlp._nlp
_get_nlp._nlp = None

def preprocess_text(text):
    """Normalize raw text for topic modelling.

    Steps: strip punctuation (keep word chars and whitespace), run the spaCy
    English pipeline, keep only alphabetic, non-stopword tokens, and return
    their lemmas joined by single spaces.

    :param text: raw document text
    :return: space-joined lemmas (may be '' if nothing survives filtering)
    """
    text = re.sub(r'[^\w\s]', '', text)
    doc = _get_nlp()(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return ' '.join(tokens)
def split_text(text, max_length=100):
    """Chunk *text* into pieces of at most *max_length* whitespace-split words.

    :param text: input string; split on any whitespace
    :param max_length: maximum number of words per chunk
    :return: list of chunk strings ([] for empty/whitespace-only input)
    """
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        chunks.append(' '.join(words[start:start + max_length]))
        start += max_length
    return chunks
# Read every file from the conference-transcript folder into {stem: text}.
path = r"C:\Users\Cezary\Documents\Monita-privata\data\konferencja-poznan\txt/"
txt_files = glob(f"{path}*", recursive=True)
txt_dict = {}
for txt_file in tqdm(txt_files):
    # os.path handles both '\' and '/' separators and filenames containing
    # dots; the previous split('\\')[-1].split('.')[0] chain produced keys
    # like 'txt/name' because the glob pattern ends in '/'.
    text_key = os.path.splitext(os.path.basename(txt_file))[0]
    with open(txt_file, 'rt', encoding='utf-8') as f:
        txt_dict[text_key] = f.read()
texts = list(txt_dict.values())
# Lemmatize/clean every document, then chunk into <=100-word passages so the
# sentence embedder sees reasonably sized inputs.
processed_texts = [preprocess_text(text) for text in tqdm(texts)]
split_texts = []
for text in tqdm(processed_texts):
    split_texts.extend(split_text(text))
# Drop chunks that became empty after preprocessing.
split_texts = [text for text in split_texts if text.strip() != '']
if len(split_texts) < 2:
    # BERTopic cannot fit on fewer than two documents.
    raise ValueError("Niewystarczająca liczba tekstów po przetwarzaniu wstępnym. Dodaj więcej danych wejściowych.")
100%|████████████████████████████████████████████████████████████████████████████████| 322/322 [00:07<00:00, 44.92it/s] 100%|████████████████████████████████████████████████████████████████████████████████| 322/322 [08:53<00:00, 1.66s/it] 100%|██████████████████████████████████████████████████████████████████████████████| 322/322 [00:00<00:00, 4519.01it/s]
# English stopword list for the CountVectorizer below (requires the NLTK
# 'stopwords' corpus to be downloaded).
stop_words = list(stopwords.words('english'))
# Alternative Polish-capable model, kept for reference:
# sentence_model = SentenceTransformer("allegro/herbert-base-cased")
# Lightweight English sentence-embedding model used for topic clustering.
sentence_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
class EnglishEmbedder(BaseEmbedder):
    """BERTopic embedding backend that wraps a SentenceTransformer model."""

    def __init__(self, embedding_model):
        # Initialize BaseEmbedder state; the original skipped this, leaving
        # attributes set by the parent constructor undefined.
        super().__init__()
        self.embedding_model = embedding_model

    def embed(self, documents, verbose=False):
        """Encode *documents* (list of str) into an embedding array.

        :param documents: texts to embed
        :param verbose: forward a progress bar to SentenceTransformer.encode
        :return: numpy array of shape (n_documents, embedding_dim)
        """
        return self.embedding_model.encode(documents, show_progress_bar=verbose)
# Wrap the sentence model in the BERTopic backend interface.
english_embedder = EnglishEmbedder(sentence_model)
# Uni- and bigram counts with English stopwords removed (used for topic words).
vectorizer_model = CountVectorizer(stop_words=stop_words, ngram_range=(1, 2))
# Dimensionality reduction ahead of clustering; cosine metric matches the
# sentence-embedding space.
umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.1, metric='cosine')
topic_model = BERTopic(
    embedding_model=english_embedder,
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    top_n_words=10,              # words shown per topic
    n_gram_range=(1, 2),
    min_topic_size=10,           # smaller clusters are merged into outliers
    calculate_probabilities=True # needed for per-document topic probabilities
)
# Fit the topic model on the preprocessed chunks and export the topic summary.
try:
    topics, probabilities = topic_model.fit_transform(split_texts)
except ValueError as e:
    # BERTopic raises ValueError e.g. when too few documents survive
    # preprocessing; dump the inputs to aid debugging, then re-raise.
    print(f"Error during model fitting: {e}")
    print("Texts:", split_texts)
    raise
# Fetch the topic summary once (the original called get_topic_info() twice:
# once for printing and again for the export).
topic_info = topic_model.get_topic_info()
print(topic_info)
topic_info.to_excel('jojs_topics_info.xlsx', index=False)
Topic Count Name
0 -1 2006 -1_jesuit_de_study_book \
1 0 173 0_music_musical_song_open access
2 1 149 1_chinese_china_ricci_jesuit
3 2 116 2_japanese_japan_xavier_buddhism
4 3 109 3_school_student_teacher_education
.. ... ... ...
81 80 11 80_jesuit_history_society_societys
82 81 11 81_canillac_mission_istanbul_canillacs
83 82 10 82_di_coster_lukács_sodality
84 83 10 83_east africa_solar_heat_geothermal
85 84 10 84_latin_italy_lithuania_vernacular
Representation
0 [jesuit, de, study, book, jesuits, journal, je... \
1 [music, musical, song, open access, open, acce...
2 [chinese, china, ricci, jesuit, matteo, riccis...
3 [japanese, japan, xavier, buddhism, kirishitan...
4 [school, student, teacher, education, teach, j...
.. ...
81 [jesuit, history, society, societys, nierember...
82 [canillac, mission, istanbul, canillacs, arsi ...
83 [di, coster, lukács, sodality, jesuit, non, fl...
84 [east africa, solar, heat, geothermal, energy,...
85 [latin, italy, lithuania, vernacular, spanish ...
Representative_Docs
0 [writing Leonardo Waisman Ant onio Ruiz de Mon...
1 [Landshut ibid K ennedy Jesuit Colleges Chapel...
2 [ysical Spiritual Realms Jesuit historian Henr...
3 [contribute study globalization Jesuit intelle...
4 [fiveplus hour day grammar humanity rhetoric c...
.. ...
81 [hermeneutic historian work Society Jesus toda...
82 [pursuit goal far Canillacs argumentation go s...
83 [study UN Claude p avur ed Ratio Studiorum off...
84 [journal jesuit study UNbrillcomjjs Strzok doi...
85 [discussion impetus rethink book interest link...
[86 rows x 5 columns]
# Each call returns a Plotly figure. In a notebook only the last expression of
# a cell renders automatically; in a plain script these return values are
# discarded — call .show() (or assign and display) to actually view them.
topic_model.visualize_topics()
topic_model.visualize_hierarchy()
topic_model.visualize_barchart()
topic_model.visualize_heatmap()